Setup:

Library

NEON MAGs

# Load the IMG metagenome export for GOLD study Gs0161344, drop three columns,
# shorten the name column to `Genome Name`, and discard rows whose name
# contains "re-annotation" or "WREF plot".
NEON_metagenomes <- read_tsv("data/NEON/exported_img_data_Gs0161344_NEON.tsv") %>%
  select(-c(`Domain`, `Sequencing Status`, `Sequencing Center`)) %>%
  rename(`Genome Name` = `Genome Name / Sample Name`) %>%
  # negate = TRUE keeps only the rows that do NOT match the pattern.
  # Spelled out as TRUE because `T` is an ordinary, reassignable variable.
  filter(
    str_detect(`Genome Name`, "re-annotation", negate = TRUE),
    str_detect(`Genome Name`, "WREF plot", negate = TRUE)
  )

# Break `Genome Name` apart into Site, Sample Name, Site ID, Subplot, Layer
# and Date columns.
NEON_metagenomes <- NEON_metagenomes %>%
  # Strip the common prefix "Terrestrial soil microbial communities from "
  mutate(
    `Genome Name` = str_replace(
      `Genome Name`, "Terrestrial soil microbial communities from ", ""
    )
  ) %>%
  # Split on " - " into the site description and the sample name
  separate(`Genome Name`, c("Site", "Sample Name"), " - ") %>%
  # Strip the common suffix "-comp-1"
  mutate(`Sample Name` = str_replace(`Sample Name`, "-comp-1", "")) %>%
  # Split the sample name on "_" into Site ID and plot info; remove = FALSE
  # keeps `Sample Name` itself, which is the join key used later
  separate(
    `Sample Name`, c("Site ID", "subplot.layer.date"), "_",
    remove = FALSE
  ) %>%
  # Split the plot info on "-" into three columns
  separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")

# Soil chemistry metadata; "-COMP" is stripped from genomicsSampleID so that
# it can be matched against `Sample Name` in the join below.
NEON_chemistry <- read_tsv("data/NEON/neon_plot_soilChem1_metadata.tsv") %>%
  mutate(genomicsSampleID = str_replace(genomicsSampleID, "-COMP", ""))

# One row per MAG with metagenome and chemistry columns attached. Columns
# present in both NEON_MAGs and NEON_metagenomes come out with .x/.y suffixes
# (later code reads `Site ID.x` and `Site.x`). `Bin ID` becomes `label`,
# which is compared against tree tip labels further down.
NEON_MAGs_metagenomes_chemistry <- NEON_MAGs %>%
  left_join(NEON_metagenomes, by = "Sample Name") %>%
  left_join(NEON_chemistry, by = c("Sample Name" = "genomicsSampleID")) %>%
  rename(label = `Bin ID`)

# Decorated GTDB-Tk trees (file names: ar53 and bac120)
tree_arc <- read.tree("data/NEON/gtdbtk.ar53.decorated.tree")
tree_bac <- read.tree("data/NEON/gtdbtk.bac120.decorated.tree")

# Gammaproteobacteria: MAGs whose Class contains "Gammaproteobacteria"
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria <- filter(
  NEON_MAGs_metagenomes_chemistry,
  str_detect(Class, "Gammaproteobacteria")
)

# Steroidobacterales: MAGs whose Order contains "Steroidobacterales"
NEON_MAGs_metagenomes_chemistry_Steroidobacterales <- filter(
  NEON_MAGs_metagenomes_chemistry,
  str_detect(Order, "Steroidobacterales")
)

# Burkholderiales: MAGs whose Order contains "Burkholderiales"
NEON_MAGs_metagenomes_chemistry_Burkholderiales <- filter(
  NEON_MAGs_metagenomes_chemistry,
  str_detect(Order, "Burkholderiales")
)

# Novel: Gammaproteobacteria MAGs missing at least one rank from Order
# down to Species
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria_Novel <- filter(
  NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria,
  is.na(Order) | is.na(Family) | is.na(Genus) | is.na(Species)
)
# almost all are novel; only two have species names
#almost are novel only two have species names



# Toolik: MAGs from site TOOL, with genome size in whole Kbp
NEON_MAGs_metagenomes_chemistry_TOOL <- NEON_MAGs_metagenomes_chemistry %>%
  filter(`Site ID.x` == "TOOL") %>%
  mutate(`Genome Size (Kbp)` = as.integer(`Total Number of Bases` / 1000))

# Toolik MAGs missing at least one rank from Order down to Species
NEON_MAGs_metagenomes_chemistry_TOOL_Novel <- NEON_MAGs_metagenomes_chemistry_TOOL %>%
  filter(is.na(Order) | is.na(Family) | is.na(Genus) | is.na(Species))
# NOTE(review): the original comment here read "almost are novel only two
# have species names", the same wording as the Gammaproteobacteria subset;
# confirm it also holds for Toolik.

TOOL_MAGs_label <- NEON_MAGs_metagenomes_chemistry_TOOL$label

# Prune the bacterial tree down to the Toolik MAGs by dropping every tip that
# is not a Toolik label. setdiff() is used instead of
# tip.label[-match(labels, tip.label)] because match() returns NA for any
# label that is absent from tree_bac, and an NA inside a negative subscript
# is an error. The Toolik table is not filtered by Domain, so such labels
# are possible.
tree_bac_TOOL_MAGs <- drop.tip(
  tree_bac,
  setdiff(tree_bac$tip.label, TOOL_MAGs_label)
)

# Make a vector with the tip labels followed by the internal node labels
node_vector_bac_TOOL_MAGS <- c(
  tree_bac_TOOL_MAGs$tip.label,
  tree_bac_TOOL_MAGs$node.label
)

# Gammaproteobacteria table with space-free names for the columns listed
# below (new name on the left, existing name on the right).
NEON_MAGs_metagenomes_chemistry_Gamma_noblank <-
  NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria %>%
  rename(
    Orders = Order,
    Phyla = Phylum,
    AssemblyType = `Assembly Type`,
    WaterpH = soilInWaterpH,
    Temp = soilTemp,
    BinCompleteness = `Bin Completeness`,
    BinContamination = `Bin Contamination`,
    TotalNumberofBases = `Total Number of Bases`,
    EcosystemSubtype = `Ecosystem Subtype`,
    GeneCount = `Gene Count`,
    # the source column name contains two consecutive spaces
    GCassembled = `GC  * assembled`
  )



# Toolik table with space-free names for the columns listed below (new name
# on the left, existing name on the right). Unlike the Gammaproteobacteria
# version, `Order` keeps its name here.
NEON_MAGs_metagenomes_chemistry_TOOL_noblank <-
  NEON_MAGs_metagenomes_chemistry_TOOL %>%
  rename(
    Phyla = Phylum,
    AssemblyType = `Assembly Type`,
    WaterpH = soilInWaterpH,
    Temp = soilTemp,
    BinCompleteness = `Bin Completeness`,
    BinContamination = `Bin Contamination`,
    TotalNumberofBases = `Total Number of Bases`,
    EcosystemSubtype = `Ecosystem Subtype`,
    GeneCount = `Gene Count`,
    # the source column name contains two consecutive spaces
    GCassembled = `GC  * assembled`
  )

# Bacterial MAGs, then the same set split by assembly type
NEON_MAGs_bact <- filter(NEON_MAGs, Domain == "Bacteria")
NEON_MAGs_bact_ind <- filter(NEON_MAGs_bact, `Assembly Type` == "Individual")
NEON_MAGs_bact_co <- filter(NEON_MAGs_bact, `Assembly Type` == "Combined")

# "Novel" here means at least one of Class, Order, Family or Genus is missing
NEON_MAGs_bact_ind_Novel <- filter(
  NEON_MAGs_bact_ind,
  is.na(Class) | is.na(Order) | is.na(Family) | is.na(Genus)
)
NEON_MAGs_bact_co_Novel <- filter(
  NEON_MAGs_bact_co,
  is.na(Class) | is.na(Order) | is.na(Family) | is.na(Genus)
)

# Toolik MAGs by order (substring match on the Order column)
NEON_MAGs_metagenomes_chemistry_TOOL_Xanthomonadales <- filter(
  NEON_MAGs_metagenomes_chemistry_TOOL,
  str_detect(Order, "Xanthomonadales")
)
NEON_MAGs_metagenomes_chemistry_TOOL_Burkholderiales <- filter(
  NEON_MAGs_metagenomes_chemistry_TOOL,
  str_detect(Order, "Burkholderiales")
)
NEON_MAGs_metagenomes_chemistry_TOOL_Steroidobacterales <- filter(
  NEON_MAGs_metagenomes_chemistry_TOOL,
  str_detect(Order, "Steroidobacterales")
)

# All MAGs whose site description mentions "Alaska"
NEON_MAGs_metagenomes_chemistry_Alaska <- filter(
  NEON_MAGs_metagenomes_chemistry,
  str_detect(Site.x, "Alaska")
)

Tree Nodes

# Tip labels followed by internal node labels of the bacterial tree. A
# position in this vector is what the figure code later passes as `node=`.
node_vector_bac <- c(tree_bac$tip.label, tree_bac$node.label)

# NEON: MAG counts per phylum, most frequent first. Do not re-sort before the
# node lookup below: the manual correction addresses row 16 by position.
phylumss <- count(NEON_MAGs_metagenomes_chemistry, Phylum, sort = TRUE)

# Column 3 (referred to below as `...3`): position in node_vector_bac of the
# label matching the phylum name. When exactly two labels match, the second
# is taken; otherwise the first (NA when nothing matches). Rows 1..28 only.
for (n in 1:28) {
  phylum_hits <- grep(phylumss[n, 1], node_vector_bac, value = TRUE)
  hit_positions <- match(phylum_hits, node_vector_bac)
  phylumss[n, 3] <- if (length(phylum_hits) == 2) {
    hit_positions[2]
  } else {
    hit_positions[1]
  }
}
# for some reason they didnt name phylum subpopulations the same way for each, so we have to correct for Desulfobacterota
# grep("Desulfobacterota", node_vector_bac, value = TRUE)
# match(grep("Desulfobacterota", node_vector_bac, value = TRUE), node_vector_bac)
# match(grep("Desulfobacterota_B", node_vector_bac, value = TRUE), node_vector_bac)
phylumss[16, 3] <- match(
  grep(phylumss[16, 1], node_vector_bac, value = TRUE),
  node_vector_bac
)[1]

# Order rows by node position, largest first
phylumss <- arrange(phylumss, desc(`...3`))

# Column 4: one viridis colour per row, again for rows 1..28 only
colortest <- viridis(29)
for (n in 1:28) {
  phylumss[n, 4] <- colortest[n]
}

# Gamma
# Pass tree_bac through Preorder(), then take the subtree at node 3048, the
# same node number that Figure 9 labels "Gammaproteobacteria".
tree_bac_node_Gammaproteobacteria <- tree_bac %>% Preorder()
tree_Gammaproteobacteria <- tree_bac_node_Gammaproteobacteria %>% Subtree(3048)

# grep("Thermoproteota", node_vector_bac, value = TRUE)
# match(grep("Thermoproteota", node_vector_bac, value = TRUE), node_vector_bac)
# 
# grep("Actinomycetota", node_vector_bac, value = TRUE)
# match(grep("Actinomycetota", node_vector_bac, value = TRUE), node_vector_bac)
# 
# grep("Desulfobacterota", node_vector_bac, value = TRUE)
# match(grep("Desulfobacterota", node_vector_bac, value = TRUE), node_vector_bac)
# match(grep("Desulfobacterota_B", node_vector_bac, value = TRUE), node_vector_bac)
# 
# grep("Bacteroidota", node_vector_bac, value = TRUE)
# match(grep("Bacteroidota", node_vector_bac, value = TRUE), node_vector_bac)
# 
# grep("Verrucomicrobiota", node_vector_bac, value = TRUE)
# match(grep("Verrucomicrobiota", node_vector_bac, value = TRUE), node_vector_bac)
# 
# grep("Chloroflexota", node_vector_bac, value = TRUE)
# match(grep("Chloroflexota", node_vector_bac, value = TRUE), node_vector_bac)
# 
# grep("Eremiobacterota", node_vector_bac, value = TRUE)
# match(grep("Eremiobacterota", node_vector_bac, value = TRUE), node_vector_bac)
# 
# grep("Patescibacteria", node_vector_bac, value = TRUE)
# match(grep("Patescibacteria", node_vector_bac, value = TRUE), node_vector_bac)
# 
# grep("Pseudomonadota", node_vector_bac, value = TRUE)
# match(grep("Pseudomonadota", node_vector_bac, value = TRUE), node_vector_bac)

# grep("Phycisphaerae", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Phycisphaerae", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
# 
# grep("Acidobacteriota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Acidobacteriota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
# 
# grep("Actinomycetota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Actinomycetota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
# 
# grep("Myxococcota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Myxococcota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
# 
# grep("Bacteroidota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Bacteroidota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
# 
# grep("Verrucomicrobiota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Verrucomicrobiota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
# 
# grep("Chloroflexota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Chloroflexota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
# 
# grep("Eremiobacterota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Eremiobacterota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
# 
# grep("Patescibacteria", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Patescibacteria", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
# 
# grep("Patescibacteria", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Patescibacteria", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)

# grep("Gammaproteobacteria", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Gammaproteobacteria", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)

# grep("Gammaproteobacteria", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Gammaproteobacteria", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)

Sankey Setup

Plot theme

# theme_classic(axis.text.x = element_text(color="grey20", size = 12,angle = 90, hjust = 0.5, vjust = 0.5), 
#                             axis.text.y = element_text(color = "grey20", size = 12), text=element_text(size = 16))

# Shared theme tweaks: black axis text, x tick labels italic and rotated
# 45 degrees, size-15 axis titles.
John_theme <- theme(
  axis.text.x = element_text(
    angle = 45, vjust = 1, hjust = 1, color = "black", face = "italic"
  ),
  axis.title.x = element_text(size = 15),
  axis.text.y = element_text(color = "black"),
  axis.title.y = element_text(size = 15)
)

Abstract:

Introduction:

    Carbon emissions from industrial activity have led to numerous changes to the global climate that threaten the ecosystems humanity depends on for industrial agriculture. Rising temperatures have caused the melting of glaciers and permafrost, releasing bacterial species that have been dormant for millennia (source). Additionally, the higher atmospheric greenhouse gas content has led to the acidification of both ocean and ground water (source). The changing climate has also led to species migration. With the recent passing of the 1.5 °C average global temperature milestone set by ORG, it is imperative that society adapt to our changing world.

    In the face of climate and antibiotic challenges, plants have developed symbiotic relationships with bacteria. PLANT BACTERIA BIOCONTROL EX. PLANT NITROGEN RELATION. Thus, it has been proposed that humanity’s crops could be better insulated against ecological changes by exploiting these relationships. While several beneficial bacterial species have been identified, the vast majority of the bacterial kingdom remains unsequenced. Additionally, given their ability to rapidly evolve in the face of ecological challenges, the number of new species with more robust tolerances to climate change influences will only grow with time. Thus, soil bacteria represent a vast untapped resource of climate-change-resistant proteins, biocontrol agents, and nitrogen fixers. Data collection efforts by organizations like the National Ecological Observatory Network provide a valuable genomic resource for phylogenetic analyses to determine the identities of potentially beneficial bacteria, as well as for monitoring the population changes caused by a changing climate.

 

    This study’s genomic data set was collected from soil samples by the National Ecological Observatory Network (NEON) from locations across the United States in GOLD Study ID Gs0161344. There were 1710 total bacterial MAGs, with 22% being individual or combined assemblies of novel species of bacteria. To make analyses more feasible, this report will only comment on two data subsets: MAGs belonging to the class Gammaproteobacteria, and MAGs found at Toolik Field Station, Alaska, USA.

 

Figure 1:

Figure 1: Uprooted maximum-likelihood phylogenetic tree of Gammaproteobacteria based on 120 concatenated single-copy protein sequences from 780 reference genomes (Liao et al. 2020). Figure adapted from Figure 2 of Liao et al.
Figure 1: Uprooted maximum-likelihood phylogenetic tree of Gammaproteobacteria based on 120 concatenated single-copy protein sequences from 780 reference genomes (Liao et al. 2020). Figure adapted from Figure 2 of Liao et al.

 

    The class Gammaproteobacteria, under the phylum Pseudomonadota, is made up of around 381 genera that thrive in marine, terrestrial, and eukaryotic host ecosystems (Liao et al. 2020). Historically, this class has been defined phylogenetically by 16S rRNA sequence homology (Williams and Kelly 2013). Some notable members of this class include Escherichia coli, Vibrio fischeri, and Pseudomonas aeruginosa. INSERT SOIL EXAMPLES. This class has a great diversity of morphologies, with rods, cocci, spirilla, and filaments all represented (Williams et al. 2010). Additionally, species in this class display a variety of trophisms, including chemoautotrophs and photoautotrophs (Gao, Mohan, and Gupta 2009).

 

Figure 2:

Figure 2: Toolik Field Station, Alaska USA (University of Alaska Fairbanks n.d.)
Figure 2: Toolik Field Station, Alaska USA (University of Alaska Fairbanks n.d.)

 

    Located 400 miles north of Fairbanks, Alaska, at the foot of the Brooks mountain range, biodiversity at Toolik Field Station is heavily influenced by its harsh winters, where temperatures can reach -50 °F. It is home to a variety of fauna including caribou, loons, voles, and polar bears. Located above the northern tree line, the vegetation in the tundra here mainly consists of birch, willow, sedges and grass. The site contains a large range of soil conditions, including layers of permafrost, created by glacial action (NEON 2023).

    This study examines the genomic content and environmental conditions of bacteria found at the Toolik Field station to help establish a reference population for future comparisons of bacterial population changes.

Methods:

Data Processing:

    Microbial samples analyzed in this study were collected from soil samples taken from NEON observation sites across the United States and sequenced via high-throughput Illumina sequencing. Sequence results were then processed and annotated by the DOE JGI Metagenome Workflow for inclusion in the Integrated Microbial Genomes and Microbiomes (IMG/M) database and the Joint Genome Institute’s Genomes Online Database (JGI GOLD). Briefly, this workflow consists of the following steps: (1) Assembly of contigs and read alignment to assembled contigs. Contigs are additionally processed for quality control. (2) Feature prediction of coding and non-coding genes, as well as CRISPR sequences. (3) Functional annotation, in which predicted features are assigned identifiers based on sequence similarity. (4) Taxonomic annotation, in which contig-level phylogenetic assignments are made based on functional annotations. (5) Binning by high- and medium-quality genome bins. Bins are additionally screened for contamination. A detailed explanation of the workflow can be found in Clum et al., ASM mSystems, 2021.

Figure Preparation:

    The figures of this study were formatted with the following packages in R: tidyverse, knitr, ggtree, TDbook (a companion package for the book “Data Integration, Manipulation and Visualization of Phylogenetic Trees” by Guangchuang Yu (2022, ISBN: 9781032233574)), ggimage, rphylopic, treeio, tidytree, ape, TreeTools, phytools, ggnewscale, ggtreeExtra, ggstar, DT (GGTREE SOURCES)

Results:

GOLD Study

Figure 3:

# Figure 3: circular cladogram of the bacterial tree with one label and one
# highlight per phylum that has a node recorded in phylumss.
# phylumss columns by position: 1 = phylum name, 3 = node number, 4 = colour.
plottttttt <- ggtree(tree_bac, layout = "circular", branch.length = "none")
# Walk every row; rows without a node (NA in column 3) are skipped, so rows
# that were never assigned a node or colour contribute nothing.
for (n in seq_len(nrow(phylumss))) {
  clade_node <- as.integer(phylumss[n, 3])
  if (is.na(clade_node)) {
    next
  }
  clade_name <- as.character(phylumss[n, 1])
  clade_color <- as.character(phylumss[n, 4])
  plottttttt <- plottttttt +
    geom_cladelab(
      node = clade_node, label = clade_name, size = 10, align = TRUE,
      angle = "auto", offset.text = 1, textcolor = clade_color,
      barsize = 1.5, fontsize = 5, barcolor = clade_color
    ) +
    # alpha is an argument of geom_hilight(); placed inside as.character()
    # it would be silently ignored and the highlight would use its default
    geom_hilight(node = clade_node, fill = clade_color, alpha = .6)
}
plottttttt

Figure 3: Phylogenetic Tree of all bacterial MAGs. Tree constructed from samples collected in GOLD Study ID Gs0161344 by the National Ecological Observatory Network.

  ### Figure 4:

# Figure 4: embed the pre-built Sankey HTML page in the rendered report.
knitr::include_url("data/lab14/sankey-NEON_MAG_ind_pavian.txt.html")

Figure 4: Sankey plot of all NEON Individual assembly MAGs.

 

Figure 5:

# Figure 5: individual-assembly bacterial MAG counts per phylum, one dodged
# bar per site, phyla ordered by frequency. The count axis is limited to
# 0-100.
NEON_MAGs_bact_ind %>%
  ggplot(aes(x = fct_rev(fct_infreq(Phylum)), fill = `Site ID`)) +
  geom_bar(position = "dodge") +
  coord_flip() +
  labs(x = "Phylum", y = "NEON MAGs (n)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(color = "black"),
    axis.title.x = element_text(size = 15),
    axis.text.y = element_text(color = "black", face = "italic"),
    axis.title.y = element_text(size = 15),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 10)
  ) +
  scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, by = 10))

Figure 5: NEON MAG distribution by phylum. NEON site distribution of MAGs collected in GOLD Study ID Gs0161344 organized by phylum.  

    The three phyla with the lion's share of bacteria found in this study were Actinomycetota, Pseudomonadota, and Acidobacteriota (Fig. 3, 4, 5). National Grasslands LBJ, Texas USA (CLBJ) accounted for the greatest portion of bacteria (Fig. 5). A majority of phyla found in this study contained MAGs collected in at least two locations, but quite a few phyla with few MAGs were found in a single location. For example, MAGs belonging to the Desulfobacterota, Myxococcota_A, Eisenbacteria, Krumholzibacteriota, and Nitrospirota phyla were only found in Chase Lake Wetlands, North Dakota, USA (Fig. 5).

Figure 6:

# Figure 6: novel individual-assembly bacterial MAGs per site, sites ordered
# by frequency; the legend is hidden because fill duplicates the axis.
NEON_MAGs_bact_ind_Novel %>%
  ggplot(aes(x = fct_rev(fct_infreq(`Site ID`)), fill = `Site ID`)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  labs(x = "Site ID", y = "Total Novel Bacteria (n)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(color = "black"),
    axis.title.x = element_text(size = 15),
    axis.text.y = element_text(color = "black")
  )

Figure 6: Novel Bacteria MAG NEON Site Distribution. Novel Bacteria were determined from MAGs constructed from individual assemblies. Novel indicates the MAGs could not be placed in an existing group at the species, genus or family level.  

  ### Figure 7:

# Figure 7: novel individual-assembly bacterial MAGs per phylum.
# NEON_MAGs_bact_ind_Novel is NEON_MAGs_bact_ind restricted to rows missing
# Class, Order, Family or Genus, i.e. the same rows the inline filter picked.
NEON_MAGs_bact_ind_Novel %>%
  ggplot(aes(x = fct_rev(Phylum), fill = Phylum)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  labs(x = "Phylum", y = "Novel Species MAGs (n)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(color = "black"),
    axis.title.x = element_text(size = 15),
    axis.text.y = element_text(color = "black", face = "italic")
  ) +
  scale_y_continuous(limits = c(0, 150), breaks = seq(0, 150, by = 10))

Figure 7: Novel bacteria predominately found in the phylum Actinomycetota. Novel bacteria were determined from MAGs constructed from individual assemblies in GOLD Study ID Gs0161344. Novel indicates the MAGs could not be placed in an existing group at the species, genus or family level.  

    There were 243 and 129 novel species of bacteria annotated in GOLD Study ID Gs0161344’s individual and combined assembly MAGs, respectively. The majority of the novel individual assembly MAGs were found in the phylum Actinomycetota (Fig. 7). Novel individual assembly MAGs were spread out over all NEON collection sites, with the top three sites for novel individual assemblies being National Grasslands LBJ, Texas USA (CLBJ), Great Basin, Onaqui, Utah USA (ONAQ), and Konza Prairie Bio Station, Kansas USA (KONZ) (Fig. 6).

Figure 8:

# Bacterial MAGs with genome size expressed in whole Kbp
NEON_MAGs_metagenomes_chemistry_Bacteria <- NEON_MAGs_metagenomes_chemistry %>%
  filter(Domain == "Bacteria") %>%
  mutate(`Genome Size (Kbp)` = as.integer(`Total Number of Bases` / 1000))

# Figure 8: panel A (genome size by phylum) next to panel B (gene count vs
# genome size). Adding two ggplot objects with `+` presumably relies on
# patchwork being attached elsewhere in the document; confirm.
(
  # Panel A: boxplot of genome size per phylum
  ggplot(
    NEON_MAGs_metagenomes_chemistry_Bacteria,
    aes(x = Phylum, y = `Genome Size (Kbp)`, color = Phylum)
  ) +
    geom_boxplot(show.legend = FALSE) +
    scale_y_continuous(limits = c(0, 15000), breaks = seq(0, 15000, by = 2500)) +
    theme_classic() +
    labs(title = "A", x = "Phylum", y = "Genome Size (Kbp)") +
    John_theme +
    theme(title = element_text(size = 20))
) + (
  # Panel B: scatter of gene count against genome size, coloured by phylum
  ggplot(
    NEON_MAGs_metagenomes_chemistry_Bacteria,
    aes(x = `Gene Count`, y = `Genome Size (Kbp)`, color = Phylum)
  ) +
    geom_point() +
    labs(title = "B", x = "Gene Count (n)", y = "") +
    scale_y_continuous(limits = c(0, 15000), breaks = seq(0, 15000, by = 2500)) +
    scale_x_continuous(limits = c(0, 15000), breaks = seq(0, 15000, by = 2500)) +
    theme_classic() +
    theme(
      axis.text.x = element_text(color = "black"),
      axis.title.x = element_text(size = 15),
      axis.text.y = element_text(color = "black"),
      legend.title = element_text(size = 15),
      legend.text = element_text(size = 10),
      title = element_text(size = 20)
    )
)

Figure 8: Overall linear trend in gene count vs genome size for MAGs found across NEON sites. A) Boxplot of MAG genome size (kbp) for all bacterial phyla. B) Scatter plot of MAG Gene count vs genome size (kbp) for all bacterial MAGs. MAGs are colored by phylum. All samples found in GOLD Study ID Gs0161344.

 

    Terrestrial bacteria are known to have large genomes encoding thousands of genes. This is due in large part to the diverse environments they are exposed to. Their larger genomes allow for the expression of multiple metabolic phenotypes that allow them to adapt to environmental challenges. NEON samples analyzed in this study had a broad spread of genome sizes, with the minimum and maximum genome sizes being 753 kbp from the phylum Chloroflexota and 12,584 kbp from the phylum Actinomycetota, respectively (Fig. 8). There was a linear relationship between gene count and genome size for all NEON samples, with a rough 1,000 bp per gene ratio (Fig. 8B).

 

Gammaproteobacteria Samples

Figure 9:

# Figure 9: circular cladogram of the bacterial tree. The phylum in row 1 of
# phylumss is highlighted and labelled in its assigned colour; node 3048 is
# highlighted and labelled "Gammaproteobacteria" in steel blue.
ggtree(tree_bac, layout = "circular", branch.length = "none") +
  # alpha is an argument of geom_hilight(); placed inside as.character() it
  # would be silently ignored and the highlight would use its default
  geom_hilight(
    node = as.integer(phylumss[1, 3]),
    fill = as.character(phylumss[1, 4]),
    alpha = .6
  ) +
  geom_cladelab(
    node = as.integer(phylumss[1, 3]),
    label = as.character(phylumss[1, 1]),
    size = 10, align = TRUE, angle = "auto", offset.text = 1,
    # colours are passed as plain strings, not as a 1x1 tibble
    textcolor = as.character(phylumss[1, 4]),
    barsize = 1.5, fontsize = 5,
    barcolor = as.character(phylumss[1, 4])
  ) +
  geom_hilight(node = 3048, fill = "steelblue", alpha = .6) +
  geom_cladelab(
    node = 3048, label = "Gammaproteobacteria", align = TRUE,
    angle = "auto", offset = 1, offset.text = 0.5,
    textcolor = "steelblue", barcolor = "steelblue",
    barsize = 1.5, fontsize = 5
  )

Figure 9: Phylogenetic tree of all bacterial MAGs. Tree contains all bacterial samples collected in GOLD Study ID Gs0161344 by the National Ecological Observatory Network, with the Gammaproteobacteria class of Pseudomonadota highlighted in blue.

Figure 10:

# Figure 10: embed the pre-built Gammaproteobacteria Sankey HTML page in the
# rendered report.
knitr::include_url("data/lab14/sankey-NEON_MAG_ind_Gpro(2).html")

Figure 10: Sankey plot of individual assembly Gammaproteobacteria MAGs. All samples found in GOLD Study ID Gs0161344.  

Figure 11:

Figure 11: Phylogenetic Tree of Gammaproteobacteria. Marker size is based on total number of bases in MAG. Markers colored by ecosystem subtype. Tree includes MAGs in GOLD Study ID Gs0161344 filtered to those annotated as belonging to the class Gammaproteobacteria.

 

Figure 12:

# Figure 12: Gammaproteobacteria MAG counts per family, one facet per order
# (free family axis per facet). The count axis is limited to 0-75.
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria %>%
  ggplot(aes(x = Family)) +
  geom_bar(
    aes(fill = Family),
    position = position_dodge2(width = 0.9, preserve = "single"),
    show.legend = FALSE
  ) +
  coord_flip() +
  facet_wrap(vars(Order), scales = "free_y", ncol = 4) +
  labs(x = "Family", y = "MAGs (n)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(color = "black"),
    axis.title.x = element_text(size = 15),
    axis.text.y = element_text(color = "black", face = "italic"),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 10),
    strip.text = element_text(size = 15, face = "italic")
  ) +
  scale_y_continuous(limits = c(0, 75), breaks = seq(0, 60, by = 15))

Figure 12: Distribution of Gammaproteobacteria MAGs by order. Barplot of Gammaproteobacteria families organized into panels by order. Barplot includes MAG reads from all NEON sites in GOLD Study ID Gs0161344 filtered to those annotated as belonging to the class Gammaproteobacteria.

 

Figure 13:

# Figure 13: genome size (whole Kbp) of Gammaproteobacteria MAGs per order.
# The size axis is limited to 0-15000.
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria %>%
  mutate(`Genome Size (Kbp)` = as.integer(`Total Number of Bases` / 1000)) %>%
  ggplot(aes(x = Order, y = `Genome Size (Kbp)`, color = Order)) +
  geom_boxplot(show.legend = FALSE) +
  scale_y_continuous(
    limits = c(0, 15000),
    breaks = seq(2500, 15000, by = 2500)
  ) +
  theme_classic() +
  labs(x = "Order", y = "Genome Size (Kbp)") +
  John_theme

Figure 13: Genome Size (kbp) of Gammaproteobacteria Members between 2500-5000 Kbp with large variation in Burkholderiales. Boxplots includes MAG reads from all NEON sites filtered to those annotated as belonging to the class Gammaproteobacteria.  

    NEON MAGs assigned to the Gammaproteobacteria class were found in all ecosystem subtypes (Fig. 11). Unlike the larger NEON data set, the distribution of genome size of Gammaproteobacteria members was fairly narrow, with members averaging between 2000 and 5000 kbp (Fig. 13). With the exception of Burkholderiales, Steroidobacterales, and Xanthomonadales, this was largely due to the fewer MAGs in each order (Fig. 12). The vast majority of Gammaproteobacteria annotated in this study were novel species, with only two bacteria belonging to the Xanthomonadales order assigned as Stenotrophomonas stenotrophomonas sp024519465. Steroidobacterales had by far the most annotated members, while Burkholderiales had the most family member groups (Fig. 10, 12).
 

Figure 14:

Figure 14: Ecological Conditions of Gammaproteobacteria Samples. A) Scatterplot of sample soil temperature vs Gammaproteobacteria families found in GOLD Study ID Gs0161344. Points are colored by order group. B) Scatterplot of sample soil pH in water vs Gammaproteobacteria families found in GOLD Study ID Gs0161344. Points are colored by order group. C) Scatterplot of sample National Land Cover Database Vegetation Type vs Gammaproteobacteria families found in GOLD Study ID Gs0161344. Points are colored by order group. D) Scatterplot of sample ecosystem subtype vs Gammaproteobacteria families found in GOLD Study ID Gs0161344. Points are colored by order group.  

    Members of Gammaproteobacteria were found in a variety of ecosystem conditions. Gammaproteobacteria were found in temperatures spanning 2-28 °C (Fig. 14A). The soil pH of all Gammaproteobacteria samples was largely neutral to mildly acidic, with the lowest pH around 4 (Fig. 14B). Overall, the sedge herbaceous and grassland herbaceous vegetation classes contained the fewest family groups in Gammaproteobacteria (Fig. 14C). This is not too surprising for sedge herbaceous, as this vegetation class is only found in one Alaskan NEON site (Sup. Fig. 1). However, grassland herbaceous vegetation is present in 5 out of the 13 sample sites (Sup. Fig. 2). Interestingly, despite being the Gammaproteobacteria order with the most MAGs and existing in a variety of vegetation classes, soil pH values, and temperatures, no members of the Steroidobacterales order were found in the tropical forest or desert ecosystems sampled in this study (Fig. 14). These ecosystem subtypes correspond to NEON sites in Puerto Rico and Arizona, USA, respectively (Sup. Fig. 2).

 

Figure 15:

# Add genome size in whole Kbp to the Steroidobacterales subset
NEON_MAGs_metagenomes_chemistry_Steroidobacterales <-
  NEON_MAGs_metagenomes_chemistry_Steroidobacterales %>%
  mutate(`Genome Size (Kbp)` = as.integer(`Total Number of Bases` / 1000))

# Figure 15: four panels per Steroidobacterales genus. Adding ggplot objects
# with `+` presumably relies on patchwork being attached elsewhere; confirm.
(
  # Panel A: ecosystem subtype
  ggplot(
    NEON_MAGs_metagenomes_chemistry_Steroidobacterales,
    aes(x = Genus, y = `Ecosystem Subtype`, color = Genus)
  ) +
    geom_point(show.legend = FALSE) +
    labs(title = "A", x = "", y = "Ecosystem Subtype") +
    theme_classic() +
    John_theme +
    theme(title = element_text(size = 20))
) + (
  # Panel B: site ID
  ggplot(
    NEON_MAGs_metagenomes_chemistry_Steroidobacterales,
    aes(x = Genus, y = `Site ID.x`, color = Genus)
  ) +
    geom_point(show.legend = FALSE) +
    labs(title = "B", x = "", y = "Site ID") +
    theme_classic() +
    John_theme +
    theme(title = element_text(size = 20))
) + (
  # Panel C: genome size
  ggplot(
    NEON_MAGs_metagenomes_chemistry_Steroidobacterales,
    aes(x = Genus, y = `Genome Size (Kbp)`, color = Genus)
  ) +
    geom_boxplot(show.legend = FALSE) +
    scale_y_continuous(
      limits = c(0, 15000),
      breaks = seq(2500, 15000, by = 2500)
    ) +
    theme_classic() +
    labs(title = "C", x = "Genus", y = "Genome Size (kbp)") +
    John_theme +
    theme(title = element_text(size = 20))
) + (
  # Panel D: MAG count per genus
  ggplot(
    NEON_MAGs_metagenomes_chemistry_Steroidobacterales,
    aes(x = Genus, fill = Genus)
  ) +
    geom_bar(show.legend = FALSE) +
    labs(title = "D", x = "Genus", y = "MAGs (n)") +
    theme_classic() +
    scale_y_continuous(limits = c(0, 50), breaks = seq(0, 50, by = 10)) +
    John_theme +
    theme(title = element_text(size = 20))
)

Figure 15: Steroidobacteraceae members are found in a variety of ecosystems but are overwhelmingly represented by the genus Bog-1198. A) Dotplot of ecosystem subtype each genus of Steroidobacteraceae was found in. B) Dotplot of the NEON sites each genus of Steroidobacteraceae was found in. C) Boxplot of MAG genome sizes (kbp) for Steroidobacteraceae genera. D) Barplot of Steroidobacteraceae genera MAGs. All samples belong to the GOLD Study ID Gs0161344.

  ### Figure 16:

Figure 16: Steroidobacteraceae BOG-1198 is found predominantly in the northern USA. A) Barchart of BOG-1198 MAGs found in NEON ecosystem subtypes. B) Barchart of BOG-1198 MAGs found at NEON sites. All samples belong to the GOLD Study ID Gs0161344.

 

    Members in the order Steroidobacterales were further examined to see if their high population is correlated with genome size or sample location. Only the Steroidobacteraceae family was found under the order. This family contains 7 genera, with the genus BOG-1198 accounting for 30 of the 50 Steroidobacterales MAGs (Fig. 15A). Once again the distribution of genome size appears correlated to the total MAG counts for each genus in the Steroidobacteraceae family, with the higher populations corresponding to larger deviation in genome size (Fig. 15C,D). Given that BOG-1198 accounted for a majority of Steroidobacterales MAGs, the genus was further examined. Members of this genus were found in several ecosystem subtypes and sample sites located in the northern United States (Fig. 16). The majority of individually assembled Steroidobacteraceae BOG-1198 MAGs were found at one of the three Alaskan sample sites (Fig. 16B). Note that all combined assembly MAGs were given the shrubland ecosystem subtype. A third of all BOG-1198 MAGs were combined assemblies.

 

Figure 17:

# Add genome size in kilobase pairs (bases / 1000, truncated to an integer)
# to the Burkholderiales table; the column is used by panel C below.
NEON_MAGs_metagenomes_chemistry_Burkholderiales <- mutate(
  NEON_MAGs_metagenomes_chemistry_Burkholderiales,
  `Genome Size (Kbp)` = as.integer(`Total Number of Bases` / 1000)
)

# Figure 17: four ggplots joined with `+` into one figure (this relies on a
# plot-composition package such as patchwork being attached). Every panel is
# coloured by Family; only panels B and D draw a legend.
# Panel A: ecosystem subtype per genus.
ggplot(
  NEON_MAGs_metagenomes_chemistry_Burkholderiales,
  aes(x = Genus, y = `Ecosystem Subtype`, color = Family)
) +
  geom_point(show.legend = FALSE) +
  labs(title = "A", x = "", y = "Ecosystem Subtype") +
  theme_classic() +
  John_theme +
  theme(title = element_text(size = 20)) +
  # Panel B: NEON site ID per genus.
  ggplot(
    data = NEON_MAGs_metagenomes_chemistry_Burkholderiales,
    aes(x = Genus, y = `Site ID.x`, color = Family)
  ) +
  geom_point() +
  labs(title = "B", x = "", y = "Site ID") +
  theme_classic() +
  John_theme +
  theme(
    title = element_text(size = 20),
    legend.text = element_text(size = 10, face = "italic"),
    legend.title = element_text(size = 15)
  ) +
  # Panel C: genome size distribution per genus.
  ggplot(
    data = NEON_MAGs_metagenomes_chemistry_Burkholderiales,
    aes(x = Genus, y = `Genome Size (Kbp)`, color = Family)
  ) +
  geom_boxplot(show.legend = FALSE) +
  scale_y_continuous(
    limits = c(0, 15000),
    breaks = seq(2500, 15000, by = 2500)
  ) +
  theme_classic() +
  labs(title = "C", x = "Genus", y = "Genome Size (Kbp)") +
  John_theme +
  theme(title = element_text(size = 20)) +
  # Panel D: number of MAGs per genus.
  ggplot(
    data = NEON_MAGs_metagenomes_chemistry_Burkholderiales,
    aes(x = Genus, fill = Family)
  ) +
  geom_bar() +
  labs(title = "D", x = "Genus", y = "MAGs (n)") +
  theme_classic() +
  John_theme +
  theme(
    title = element_text(size = 20),
    legend.text = element_text(size = 10, face = "italic"),
    legend.title = element_text(size = 15)
  )

Figure 17: Burkholderiales members are found in a variety of ecosystems and have a broad distribution of genome size. A) Scatter plot of sample ecosystem subtype vs Burkholderiales genera. B) Scatter plot of NEON site ID vs Burkholderiales genera. C) Box plot of Burkholderiales genera genome sizes (kbp). D) Bar plot of Burkholderiales genera MAGs. All plots colored by family group. All samples from GOLD Study ID Gs0161344.

  ### Figure 18:

# Keep only the Burkholderiales rows whose Family matches "Burkholderiaceae".
NEON_MAGs_metagenomes_chemistry_Burkholderiaceae <- filter(
  NEON_MAGs_metagenomes_chemistry_Burkholderiales,
  str_detect(Family, "Burkholderiaceae")
)

# Figure 18: two bar charts joined with `+` (this relies on a plot-composition
# package such as patchwork being attached). Bars are filled by Genus; only
# panel B draws the legend.
# Panel A: MAG counts per ecosystem subtype.
ggplot(
  data = NEON_MAGs_metagenomes_chemistry_Burkholderiaceae,
  aes(x = `Ecosystem Subtype`, fill = Genus)
) +
  geom_bar(show.legend = FALSE) +
  labs(title = "A", x = "Ecosystem Subtype", y = "MAGs (n)") +
  theme_classic() +
  theme(
    axis.title = element_text(size = 15),
    title = element_text(size = 20),
    axis.text.y = element_text(color = "black"),
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, color = "black")
  ) +
  # Panel B: MAG counts per NEON site ID.
  ggplot(
    data = NEON_MAGs_metagenomes_chemistry_Burkholderiaceae,
    aes(x = `Site ID.x`, fill = Genus)
  ) +
  geom_bar() +
  labs(title = "B", x = "Site ID", y = "MAGs (n)") +
  theme_classic() +
  theme(
    axis.title = element_text(size = 15),
    axis.text.y = element_text(color = "black"),
    axis.text.x = element_text(color = "black", angle = 45, vjust = 1, hjust = 1),
    title = element_text(size = 20),
    legend.text = element_text(size = 10, face = "italic"),
    legend.title = element_text(size = 15)
  )

Figure 18: Genera of Burkholderiaceae are found in a variety of ecosystems and NEON locations. A) Barplot of Burkholderiaceae genera MAGs found in NEON ecosystem subtypes. B) Barplot of Burkholderiaceae genera MAGs found at NEON sites. The site labeled NA is for combined assembly MAGs. Bars are colored by genus group. Bars colored as NA represent novel genera in Burkholderiales. All samples from GOLD Study ID Gs0161344.

 

    Members of the order Burkholderiales were also examined further at the genus level to determine if their broader diversity corresponded to the variety of ecosystems they were found in. Indeed, members of this order were found across the United States in several different ecosystem subtypes (Fig. 17A,B). Individually assembled members of the Burkholderiaceae, including the genera Caballeronia, Herbaspirillum, and Paraburkholderia, were found mainly in temperate forests, with some members also found in tundra and boreal forest/taiga subtypes (Fig. 17, 18A). Interestingly, genus genome size distribution appears not to be correlated to the number of genus members. Despite only having 4 MAGs from the Niwot Ridge site in Colorado, the Herbaspirillum genus contained a broad range of genome sizes (Fig. 17C). The Caballeronia genus, containing 5 MAGs, had a much tighter distribution of genome size (Fig. 17C). Also of note is the appearance of the Trinickia genus in the Wind River Experimental Forest in Washington (Fig. 17, 18). Trinickia members have previously been found to form endosymbiotic relations with the phytopathogenic fungus Rhizopus microsporus (Source).

 

Toolik Field Station Samples

Figure 19:

# Figure 19: embed the pre-rendered Toolik Sankey HTML page.
knitr::include_url(url = "data/lab14/sankey-NEON_MAG_Toolik.html")

Figure 19: Sankey plot of all MAGs from Toolik Field Station, Alaska USA. Samples collected in GOLD Study ID Gs0161344.  

Figure 20:

# Figure 20: circular cladogram (branch lengths ignored) of the Toolik tree
# with two clades highlighted and labelled.
# NOTE(review): nodes 258 and 259 are hard-coded internal node numbers of
# `tree_bac_TOOL_MAGs`; presumably they must be re-checked if the tree is
# regenerated.
ggtree(tree_bac_TOOL_MAGs, layout = "circular", branch.length = "none") +
    # Pseudomonadota clade: grey highlight and label bar.
    geom_hilight(node = 258, fill = "grey", alpha = 0.6) +
    geom_cladelab(
      node = 258,
      label = "Pseudomonadota",
      align = TRUE,
      angle = "auto",
      offset.text = 0.5,
      textcolor = "black",
      barcolor = "grey",
      barsize = 1.5,
      fontsize = 5
    ) +
    # Gammaproteobacteria clade: blue highlight; `offset` moves its label bar
    # away from the tree.
    geom_hilight(node = 259, fill = "steelblue", alpha = 0.6) +
    geom_cladelab(
      node = 259,
      label = "Gammaproteobacteria",
      align = TRUE,
      angle = "auto",
      offset = 0.75,
      offset.text = 0.5,
      textcolor = "black",
      barcolor = "steelblue",
      barsize = 1.5,
      fontsize = 5
    )

Figure 20: Phylogenetic tree of all MAGs collected at Toolik Field Station, Alaska USA. Gammaproteobacteria class of Pseudomonadota highlighted in blue. Samples collected in GOLD Study ID Gs0161344 by the National Ecological Observatory Network at Toolik Field Station, Alaska USA.

 

Figure 21:

Figure 21: MAGs found at Toolik Field Station are dominated by Actinomycetota, Pseudomonadota, and Acidobacteriota members. Stacked barplot of bacterial phyla found at Toolik Field Station, Alaska USA in GOLD Study ID Gs0161344. Bars are colored by Order groups.

      The majority of MAGs collected at Toolik Field Station belonged to the Actinomycetota and Pseudomonadota phyla (Fig. 20, 21). Inside Actinomycetota, MAGs belonging to the order Thermoleophilia dominated (Fig. 21). The phylum Pseudomonadota was almost equally divided into Alpha- and Gammaproteobacteria (Fig. 21). The phylum Acidobacteriota was also frequently found at Toolik, with the vast majority of its MAGs belonging to the order Terriglobia (Fig. 21).  

Figure 22:

Figure 22: Toolik Field Station has some of the lowest sample temperatures but average sample pHs compared to other NEON sites. A) Scatterplot of sample soil temperatures at each NEON site. B) Scatterplot of sample soil pHs in water at each NEON site. Points colored by Site ID. All samples from GOLD Study ID Gs0161344.

 

Figure 23:

# Figure 23: soil temperature of each MAG's sample by phylum, one facet per
# `Site.x` value, points coloured by phylum (legend hidden).
# `size = 1` belongs on geom_point(): when passed to ggplot() it is silently
# ignored, so the intended point size was never applied.
NEON_MAGs_metagenomes_chemistry_Alaska %>%
  ggplot(aes(x = Phylum, y = soilTemp, color = Phylum)) +
  geom_point(size = 1, show.legend = FALSE) +
  facet_wrap(vars(Site.x), scales = "free_y", ncol = 3) +
  # Rows with a missing soilTemp or a value outside 0-10 are dropped with a
  # warning by this fixed scale.
  scale_y_continuous(limits = c(0, 10)) +
  labs(y = "Soil Temperature (Celsius)", x = "Phylum") +
  theme_classic() +
  John_theme +
  theme(strip.text = element_text(size = 13))
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_point()`).

Figure 23: Similar soil temperature and phyla found at Alaskan NEON Sites. Scatter plot of sample temperature in Celsius vs phyla at Alaskan NEON sites in GOLD Study ID Gs0161344. Left Panel: Caribou Creek Watershed, Alaska USA. Middle Panel: Healy, Denali National Park, Alaska USA. Right Panel: Toolik Field Station, Alaska USA. Points are colored by Phylum.

 

Figure 24:

# Figure 24: soil pH (measured in water) of each MAG's sample by phylum, one
# facet per `Site.x` value, points coloured by phylum (legend hidden).
# `size = 1` belongs on geom_point(): when passed to ggplot() it is silently
# ignored, so the intended point size was never applied.
NEON_MAGs_metagenomes_chemistry_Alaska %>%
  ggplot(aes(x = Phylum, y = soilInWaterpH, color = Phylum)) +
  geom_point(size = 1, show.legend = FALSE) +
  facet_wrap(vars(Site.x), scales = "free_y", ncol = 3) +
  # Full pH range with a tick at every integer.
  scale_y_continuous(limits = c(0, 14), breaks = 0:14) +
  labs(y = "Soil pH", x = "Phylum") +
  theme_classic() +
  John_theme +
  theme(strip.text = element_text(size = 13))

Figure 24: Similar soil pH and phyla found at Alaskan NEON Sites. Scatter plot of sample soil pH in water vs phyla at Alaskan NEON sites in GOLD Study ID Gs0161344. Left Panel: Caribou Creek Watershed, Alaska USA. Middle Panel: Healy, Denali National Park, Alaska USA. Right Panel: Toolik Field Station, Alaska USA. Points are colored by Phylum.

 

Figure 25:

# Figure 25: vegetation class (`nlcdClass`) of each MAG's sample by phylum,
# one facet per `Site.x` value, points coloured by phylum (legend hidden).
# `size = 1` belongs on geom_point(): when passed to ggplot() it is silently
# ignored, so the intended point size was never applied.
NEON_MAGs_metagenomes_chemistry_Alaska %>%
  ggplot(aes(x = Phylum, y = nlcdClass, color = Phylum)) +
  geom_point(size = 1, show.legend = FALSE) +
  facet_wrap(vars(Site.x), scales = "free_y", ncol = 3) +
  labs(y = "Vegetation Class", x = "Phylum") +
  theme_classic() +
  John_theme +
  theme(strip.text = element_text(size = 10))

Figure 25: Vegetation Class differs but similar phyla found at Alaskan NEON Sites. Scatter plot of National Land Class Vegetation Class vs Phyla at Alaskan NEON Sites in GOLD Study ID Gs0161344. Left Panel: Caribou Creek Watershed, Alaska USA. Middle Panel: Healy, Denali National Park, Alaska USA. Right Panel: Toolik Field Station, Alaska USA. Points are colored by Phylum.

 

Figure 26:

# Figure 26: stacked MAG counts per phylum, filled by Order, one facet per
# `Site.x` value with independent y axes.
ggplot(
  NEON_MAGs_metagenomes_chemistry_Alaska,
  aes(x = Phylum, fill = Order)
) +
  geom_bar() +
  facet_wrap(vars(Site.x), scales = "free_y", ncol = 3) +
  labs(y = "MAGs (n)", x = "Phylum") +
  theme_classic() +
  John_theme +
  theme(
    strip.text = element_text(size = 10),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 10, face = "italic")
  )

Figure 26: Phyla MAGs distribution differs at each Alaskan NEON Site. Stacked barplot of Phyla at Alaskan NEON Sites in GOLD Study ID Gs0161344 . Left Panel: Caribou Creek Watershed, Alaska USA. Middle Panel: Healy, Denali National Park, Alaska USA. Right Panel: Toolik Field Station, Alaska USA. Bars are colored by Order group.

      Given the significant distance between the three Alaskan sites and the rest of the NEON collection sites, the ecological conditions of all Alaskan MAGs were first examined together to see if phylogenetic distribution was more correlated with ecological conditions than geographic position in North America. As expected, the three Alaskan sites had the lowest sample temperatures recorded in this study (Fig. 22A). Similarly, their soil pH was among the lowest in the study (Fig. 22B). However, when examining their phyla side by side under these conditions, it became clear that at the phylum level the three sites had similar temperatures, pH, and phyla distribution (Fig. 23, 24). The Alaskan sites did vary in vegetation class, with Toolik consisting of three scrub types, Healy of just dwarf scrub, and Caribou Creek of forest and scrub (Fig. 25). Toolik Field Station had the most MAGs and phyla represented, followed by Healy (Fig. 26), indicating that scrub vegetation classes may host greater bacterial populations in Alaska than forest.

  ### Figure 27:

# Figure 27: genome size (kilobase pairs, truncated to an integer) per phylum
# for the Toolik MAGs. The size column is computed on the fly and not stored.
NEON_MAGs_metagenomes_chemistry_TOOL %>%
  mutate(`Genome Size (Kbp)` = as.integer(`Total Number of Bases` / 1000)) %>%
  ggplot(aes(x = Phylum, y = `Genome Size (Kbp)`, color = Phylum)) +
  geom_boxplot(show.legend = FALSE) +
  scale_y_continuous(
    limits = c(0, 15000),
    breaks = seq(2500, 15000, by = 2500)
  ) +
  theme_classic() +
  labs(x = "Phylum", y = "Genome Size (Kbp)") +
  John_theme

Figure 27: Genome Size Distributions at Toolik Field Station are in line with those of the wider GOLD Study. Boxplot of Phyla genome sizes (kbp) found at Toolik Field Station, Alaska USA in GOLD Study ID Gs0161344.

 

Figure 28:

# Figure 28: Toolik tree with tip points coloured by phylum, plus two side
# panels (gene count and % GC content) aligned to the tips.
ggtree(tree_bac_TOOL_MAGs) %<+%
  NEON_MAGs_metagenomes_chemistry +
  geom_tippoint(mapping = aes(colour = Phylum)) +
  # The side panels read from the `_noblank` table because, for unknown
  # reasons, geom_facet() does not like blank spaces in column names.
  geom_facet(
    panel = "Gene Count (n)",
    data = NEON_MAGs_metagenomes_chemistry_TOOL_noblank,
    geom = geom_point,
    mapping = aes(x = GeneCount, color = Phyla)
  ) +
  geom_facet(
    panel = "% GC Content ",
    data = NEON_MAGs_metagenomes_chemistry_TOOL_noblank,
    geom = geom_col,
    mapping = aes(x = GCassembled, fill = Phyla),
    orientation = "y",
    width = 0.6,
    show.legend = FALSE
  ) +
  theme_tree2(
    legend.position = c(0.1, 0.7),
    strip.text = element_text(size = 15),
    axis.text = element_text(color = "black")
  )

Figure 28: Genome content of bacteria found at Toolik Field Station. Left Panel: Phylogenetic tree of all MAGs found at Toolik Field Station, Alaska USA with markers for MAG phylum. Middle Panel: Gene Count of all MAGs found at Toolik Field Station, Alaska USA. Points colored by MAG phylum. Right Panel: % GC Content of all MAGs found at Toolik Field Station, Alaska USA. Bars colored by MAG phylum. All samples from GOLD Study ID Gs0161344.

      The genome sizes of MAGs collected at Toolik Field Station reflected those of the overall GOLD study, with the trend of greater representation correlating with greater variation in phyla genome size (Fig. 27, 28). This was especially evident for the phyla Patescibacteria, Armatimonadota, Gemmatimonadota and Verrucomicrobiota, which had low representation in Toolik MAGs (Fig. 28). The genomes at Toolik had a spread of gene counts, with Chloroflexota and Pseudomonadota species at the low and high end of the spectrum, respectively (Fig. 28). This variation was not reflected in genome GC content, with the average %GC around 60% (Fig. 28).

  ### Figure 29:

# Figure 29: horizontal stacked bars of Toolik Gammaproteobacteria MAG counts
# per Order, filled by Family. fct_infreq() orders by frequency and fct_rev()
# reverses that so the most frequent order ends up at the top after the flip.
NEON_MAGs_metagenomes_chemistry_TOOL %>%
  filter(Class == "Gammaproteobacteria") %>%
  ggplot(aes(x = fct_rev(fct_infreq(Order)), fill = Family)) +
  geom_bar() +
  coord_flip() +
  labs(x = "Order", y = "MAG Count (n)", fill = "Family") +
  theme_classic() +
  theme(
    axis.text.x = element_text(color = "black"),
    axis.title = element_text(size = 15),
    axis.text.y = element_text(color = "black", face = "italic"),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 10)
  ) +
  scale_y_continuous(limits = c(0, 50), breaks = seq(0, 50, by = 10))

Figure 29: Gammaproteobacteria at Toolik Field Station fall into the Steroidobacterales or Burkholderiales orders. Stacked Barplot of Gammaproteobacteria MAGs found at Toolik Field Station, Alaska USA in GOLD Study ID Gs0161344. Bars colored by family group.

 

Figure 30:

# Figure 30: two ggplots joined with `+` (this relies on a plot-composition
# package such as patchwork being attached).
# Panel A: genome size distribution per genus.
ggplot(
  NEON_MAGs_metagenomes_chemistry_TOOL_Steroidobacterales,
  aes(x = Genus, y = `Genome Size (Kbp)`, color = Genus)
) +
  geom_boxplot(show.legend = FALSE) +
  scale_y_continuous(
    limits = c(0, 15000),
    breaks = seq(2500, 15000, by = 2500)
  ) +
  theme_classic() +
  labs(title = "A", x = "Genus", y = "Genome Size (Kbp)") +
  John_theme +
  theme(title = element_text(size = 15)) +
  # Panel B: number of MAGs per genus.
  ggplot(
    data = NEON_MAGs_metagenomes_chemistry_TOOL_Steroidobacterales,
    aes(x = Genus, fill = Genus)
  ) +
  geom_bar(show.legend = FALSE) +
  labs(title = "B", x = "Genus", y = "MAGs (n)") +
  theme_classic() +
  John_theme +
  theme(title = element_text(size = 15))

Figure 30: Distribution of Steroidobacterales genera at Toolik Field Station, Alaska USA. A) Boxplot of sample genome size (kbp) in the Steroidobacterales order at Toolik Field Station, Alaska USA. B) Barplot of MAGs for each genus in the Steroidobacterales order found at Toolik Field Station, Alaska USA. All samples from GOLD Study ID Gs0161344.

 

Figure 31:

# Figure 31: MAG counts per Burkholderiales genus at Toolik, filled by Family,
# with the y axis fixed to 0-5 and a tick at every integer.
ggplot(
  NEON_MAGs_metagenomes_chemistry_TOOL_Burkholderiales,
  aes(x = Genus, fill = Family)
) +
  geom_bar() +
  labs(x = "Genus", y = "MAGs (n)") +
  theme_classic() +
  John_theme +
  theme(
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 10, face = "italic")
  ) +
  scale_y_continuous(limits = c(0, 5), breaks = 0:5)

Figure 31: Genus distribution of Burkholderiales is widespread at Toolik Field Station, Alaska USA. Barplot of MAGs in Burkholderiales Genera at Toolik Field Station, Alaska USA. Bars colored by family group. NA represents novel genera in Burkholderiales. All samples from GOLD Study ID Gs0161344.  

    Only two orders of Gammaproteobacteria were found at Toolik Field Station. Similar to the overall GOLD study, the order Steroidobacterales had the most MAGs, with the majority of them belonging to the BOG-1198 genus (Fig. 29, 30). The order Burkholderiales had 6 families found at Toolik Field Station (Fig. 31). The genera Caballeronia, Herbaspirillum, Paraburkholderia, and Trinickia were not found at Toolik (Fig. 18B, 31).

Discussion:

Overall GOLD Study

Gammaproteobacteria

    As expected for a class with 381 genera, Gammaproteobacteria were found in every ecosystem subtype sampled in this study. Members like those in the order Steroidobacterales were found at nearly every soil temperature and pH sampled, indicating this class’s diversity allows for members to survive in a variety of environments.       Given their wide distribution across environment types, members of the Steroidobacterales order warrant further investigation as potential bacterial sources of climate change resistant proteins. Despite their bog naming convention, Steroidobacteraceae BOG-1198 had the largest representation in this order across multiple ecosystem subtypes, indicating this genus may contain members particularly adept at surviving a variety of environmental challenges. Additionally, the order’s underrepresentation in tropical forest and desert ecosystems is quite curious. These ecosystem subtypes are quite ecologically different from each other, but their unifying higher soil temperatures do not seem to be an issue for Steroidobacterales members at other NEON sites. Given that the tropical forest ecosystem is only found at the Puerto Rico NEON site, the absence of Steroidobacterales at that location may be related to its geographic isolation as an island.       Several notable genera in Burkholderiales relevant to bacterial biocontrol were found in this study: Caballeronia, Paraburkholderia, and Trinickia. The genera Caballeronia and Paraburkholderia have been reported to contain nitrogen-fixing bacteria that may be useful for improving crop yields (SOURCE). Agricultural nitrogen sources usually come in the form of chemical fertilizer that can be washed into the local watershed by precipitation. Bacteria like those in the Caballeronia and Paraburkholderia genera are being investigated for the possibility of increasing crop nitrogen resources through the augmentation of traditional fertilizing methods with their introduction to crop rhizospheres. 
However, these two genera were found in Caribou Creek, Alaska and Yellowstone, Wyoming. Temperatures at these locations range from temperate to cold. While this study’s Caballeronia and Paraburkholderia MAGs may contain species useful for nitrogen fixation, they may also be vulnerable to increases in soil temperature caused by climate change. The genus Trinickia has been reported to contain both plant growth promoting and pathogenic bacteria species (SOURCE). More investigation into the Trinickia population at Wind River, Washington is needed to determine if this study’s MAGs are potentially agriculturally beneficial.

Toolik Field Station

Code

Lab 8

Lab 9

Lab 10

Lab 12

Lab 13

Lab 14

Lab 15

Gammaproteobacteria Tree

Lab 16

Gammaproteobacteria Graphs

Toolik Graphs

Supplemental Figures:

Supplemental Figure 1: NEON Site vs Vegetation subclass

# Supplemental Figure 1: which vegetation classes (`nlcdClass`) occur at each
# NEON site; x tick labels are rotated 45 degrees.
ggplot(
  NEON_MAGs_metagenomes_chemistry,
  aes(y = Site.x, x = nlcdClass)
) +
  geom_point() +
  labs(
    title = "NEON Site Vegetation Classes",
    y = "Site",
    x = "Vegetation Class"
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

Supplemental Figure 2: NEON Site vs Ecosystem subclass

# Supplemental Figure 2: which ecosystem subtypes occur at each NEON site;
# x tick labels are rotated 45 degrees.
ggplot(
  NEON_MAGs_metagenomes_chemistry,
  aes(y = Site.x, x = `Ecosystem Subtype`)
) +
  geom_point() +
  labs(
    title = "NEON Site Ecosystem Subtypes",
    y = "Site",
    x = "Ecosystem Subtype"
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

Supplemental Figure 3: NEON SITE vs Site ID

# Supplemental Figure 3: mapping between NEON site names and site IDs;
# x tick labels are rotated 45 degrees.
# The title must be passed as `title =`: an unnamed string given to labs() is
# not attached to any label, so the plot was drawn without its title.
NEON_MAGs_metagenomes_chemistry %>%
  ggplot(aes(x = `Site ID.x`, y = Site.x)) +
  geom_point() +
  labs(title = "NEON Site IDs", y = "Site", x = "Site ID") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

Supplemental Figure 4: Sankey plot of all NEON Combined assembly MAGs

# Supplemental Figure 4: embed the pre-rendered combined-assembly Sankey page.
knitr::include_url(url = "data/lab14/sankey-NEON_MAG_co_pavian.txt.html")

Supplemental Figure 4: Sankey plot of all NEON combined assembly MAGs

Supplemental Figure 5: Sankey plot of all NEON combined assembly Gammaproteobacteria MAGs

# Supplemental Figure 5: embed the pre-rendered Gammaproteobacteria Sankey page.
knitr::include_url(url = "data/lab14/sankey-NEON_MAG_co_Gpro(1).html")

Supplemental Figure 5: Sankey plot of all NEON combined assembly Gammaproteobacteria MAGs

Figure 5: Phylogenetic tree of all MAGs belonging to the class Gammaproteobacteria with Site location

# Gammaproteobacteria tree with tip points coloured by Order, plus two side
# panels (gene count and % GC content) aligned to the tips.
ggtree(tree_Gammaproteobacteria) %<+%
  NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria +
  geom_tippoint(mapping = aes(color = Order)) +
  # The side panels read from the `_noblank` table because, for unknown
  # reasons, geom_facet() does not like blank spaces in column names.
  geom_facet(
    panel = "Gene Count",
    data = NEON_MAGs_metagenomes_chemistry_Gamma_noblank,
    geom = geom_point,
    mapping = aes(x = GeneCount, color = Orders)
  ) +
  geom_facet(
    panel = "% GC Content ",
    data = NEON_MAGs_metagenomes_chemistry_Gamma_noblank,
    geom = geom_col,
    mapping = aes(x = GCassembled, fill = Orders),
    orientation = "y",
    width = 0.6,
    show.legend = FALSE
  ) +
  theme_tree2(legend.position = c(0.1, 0.7))

# Gammaproteobacteria tree with the full MAG table attached; points are
# coloured by `Site.x` and the x axis is limited to 0-20.
ggtree(tree_Gammaproteobacteria) %<+%
  NEON_MAGs_metagenomes_chemistry +
  xlim(0, 20) +
  geom_point(mapping = aes(color = Site.x))

Figure 5: Phylogenetic tree of all MAGs belonging to the class Gammaproteobacteria with Gene count and %GC Content.

References:

GGtree Yu G (2022). Data Integration, Manipulation and Visualization of Phylogenetic Trees, 1st edition. Chapman and Hall/CRC. doi:10.1201/9781003279242, https://www.amazon.com/Integration-Manipulation-Visualization-Phylogenetic-Computational-ebook/dp/B0B5NLZR1Z/.

Xu S, Li L, Luo X, Chen M, Tang W, Zhan L, Dai Z, Tommy T. Lam, Guan Y, Yu G (2022). “Ggtree: A serialized data object for visualization of a phylogenetic tree and annotation data.” iMeta, 1(4), e56. doi:10.1002/imt2.56, https://onlinelibrary.wiley.com/doi/full/10.1002/imt2.56.

Yu G (2020). “Using ggtree to Visualize Data on Tree-Like Structures.” Current Protocols in Bioinformatics, 69(1), e96. doi:10.1002/cpbi.96, https://currentprotocols.onlinelibrary.wiley.com/doi/abs/10.1002/cpbi.96.

Yu G, Lam TT, Zhu H, Guan Y (2018). “Two methods for mapping and visualizing associated data on phylogeny using ggtree.” Molecular Biology and Evolution, 35, 3041-3043. doi:10.1093/molbev/msy194, https://academic.oup.com/mbe/article/35/12/3041/5142656.

Yu G, Smith D, Zhu H, Guan Y, Lam TT (2017). “ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data.” Methods in Ecology and Evolution, 8, 28-36. doi:10.1111/2041-210X.12628, http://onlinelibrary.wiley.com/doi/10.1111/2041-210X.12628/abstract.

gold Please cite: Supratim Mukherjee, Dimitri Stamatis, Cindy Tianqing Li, Galina Ovchinnikova, Jon Bertsch, Jagadish Chandrabose Sundaramurthi, Mahathi Kandimalla, Paul A. Nicolopoulos, Alessandro Favognano, I-Min A. Chen , Nikos C. Kyrpides and T.B.K. Reddy. Twenty-five years of Genomes OnLine Database (GOLD): data updates and new features in v.9. Nucl. Acids Res. (2022) doi: doi.org/10.1093/nar/gkac974

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8269246/

Gao, Beile, Ritu Mohan, and Radhey S. Gupta. 2009. “Phylogenomics and Protein Signatures Elucidating the Evolutionary Relationships Among the Gammaproteobacteria.” International Journal of Systematic and Evolutionary Microbiology 59 (2): 234–47. https://doi.org/10.1099/ijs.0.002741-0.
Liao, Hu, Xiaolan Lin, Yuqian Li, Mingming Qu, and Yun Tian. 2020. “Reclassification of the Taxonomic Framework of Orders Cellvibrionales, Oceanospirillales, Pseudomonadales, and Alteromonadales in Class Gammaproteobacteria Through Phylogenomic Tree Analysis.” mSystems 5 (5): 10.1128/msystems.00543–20. https://doi.org/10.1128/msystems.00543-20.
NEON, Collection. 2023. “Collection - Getting to Know the NEON Domains.” ArcGIS StoryMaps. https://storymaps.arcgis.com/collections/5765fc95c3c24297a5b9dc2c99e69e5c.
University of Alaska Fairbanks, Collection. n.d. “Photo Gallery Toolik Field Station.” Accessed April 2, 2024. https://www.uaf.edu/toolik/about/toolik-gallery.php.
Williams, Kelly P., Joseph J. Gillespie, Bruno W. S. Sobral, Eric K. Nordberg, Eric E. Snyder, Joshua M. Shallom, and Allan W. Dickerman. 2010. “Phylogeny of Gammaproteobacteria.” Journal of Bacteriology 192 (9): 2305–14. https://doi.org/10.1128/JB.01480-09.
Williams, Kelly P., and Donovan P. Kelly. 2013. “Proposal for a New Class Within the Phylum Proteobacteria, Acidithiobacillia Classis Nov., with the Type Order Acidithiobacillales, and Emended Description of the Class Gammaproteobacteria.” International Journal of Systematic and Evolutionary Microbiology 63 (Pt_8): 2901–6. https://doi.org/10.1099/ijs.0.049270-0.